# suppress the display of warning messages
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.model_selection import train_test_split, cross_validate,\
GridSearchCV, cross_val_score, KFold, ParameterGrid
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support,\
accuracy_score, recall_score, precision_score, f1_score,\
confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier,\
BaggingClassifier, AdaBoostClassifier
# install imbalanced-learn package
!pip install -U imbalanced-learn
# import samplers and classifiers from imblearn
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT
# from collections import Counter
# from sklearn.datasets import make_classification
# from sklearn.svm import LinearSVC
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import (RandomOverSampler, ADASYN,
SMOTE, BorderlineSMOTE, SVMSMOTE)
from imblearn.under_sampling import (RandomUnderSampler,
ClusterCentroids,
NearMiss,
InstanceHardnessThreshold,
CondensedNearestNeighbour,
EditedNearestNeighbours,
RepeatedEditedNearestNeighbours,
AllKNN,
NeighbourhoodCleaningRule,
OneSidedSelection)
from imblearn.combine import (SMOTEENN, SMOTETomek)
from imblearn.ensemble import (BalancedBaggingClassifier,
BalancedRandomForestClassifier,
EasyEnsembleClassifier,
RUSBoostClassifier)
# from imblearn.base import BaseSampler
# Mount Google Drive (Colab-only) so the CSV below can be read.
# Must be done each time the session expires.
from google.colab import drive
drive.mount('/content/drive')
# set the global numpy seed so that results are reproducible
# (later calls without an explicit random_state rely on this)
np.random.seed(123456)
# import the dataset from the mounted drive
df = pd.read_csv('/content/drive/My Drive/carAuction.csv')
df.info()
# null value counts for each column
print('Null value counts:', "\n", df.isnull().sum())
# distribution of the target variable
y_counts = df['IsBadBuy'].value_counts()
y_counts
# save the class labels (ordered by frequency) for the report loops below
y_levels = y_counts.index
y_levels
# All the "Yes" (lemon) instances in the dataset
yes_df = df[df.IsBadBuy.str.match('Yes')]
yes_df.shape
# Randomly sample from the yeses -- yes_df1 will have 25% of the yeses,
# yes_df2 will have 75% of the yeses.
# NOTE: no explicit random_state here -- reproducibility relies on the
# np.random.seed() call above.
yes_df1, yes_df2 = train_test_split(yes_df, train_size = 0.25)
yes_df1.shape
# build two progressively more imbalanced copies of the data
df_imb1 = df[~df.index.isin(yes_df1.index)].copy() # subtract 25% of the yeses
df_imb2 = df[~df.index.isin(yes_df2.index)].copy() # subtract 75% of the yeses
df_imb1.shape
df_imb2.shape
# distribution of the target variable in the original and the two new datasets
y_levels = y_counts.index
y_levels
imb1_y_counts = df_imb1['IsBadBuy'].value_counts()
imb2_y_counts = df_imb2['IsBadBuy'].value_counts()
print("In the original dataset:", "\n")
for i in y_levels:
    print(f"{round(y_counts[i]/len(df)*100, 2)}% is {i}", "\n")
print("\n")
print("In df_imb1:", "\n")
for i in y_levels:
    # BUG FIX: percentages must be relative to df_imb1's own row count,
    # not len(df) -- the original divided by the full dataset's size
    print(f"{round(imb1_y_counts[i]/len(df_imb1)*100, 2)}% is {i}", "\n")
print("\n")
print("In df_imb2:", "\n")
for i in y_levels:
    # BUG FIX: same correction for df_imb2
    print(f"{round(imb2_y_counts[i]/len(df_imb2)*100, 2)}% is {i}", "\n")
# oversamplers from imblearn.over_sampling, all seeded for reproducibility
osampler_list = [RandomOverSampler(random_state=42),
                 ADASYN(random_state=42),
                 SMOTE(random_state=42),
                 SVMSMOTE(random_state=42),
                 BorderlineSMOTE(random_state=42)]
# display names, parallel to osampler_list (must stay in the same order)
osampler_name_list = ['ROS','ADASYN','SMOTE','SVMSMOTE','BorderlineSMOTE']
# undersamplers from imblearn.under_sampling
usampler_list = [RandomUnderSampler(random_state=42),
                 NearMiss(version=1),
                 NearMiss(version=2),
                 NearMiss(version=3),
                 # TomekLinks(),
                 EditedNearestNeighbours(),
                 RepeatedEditedNearestNeighbours(),
                 AllKNN()]
# display names, parallel to usampler_list
usampler_name_list = ['RUS','NearMiss1','NearMiss2','NearMiss3',
                      'ENN','RENN','ALLKNN']
# samplers that combine over- and undersampling, from imblearn.combine
csampler_list = [SMOTEENN(random_state=42),
                 SMOTETomek(random_state=42)]
csampler_name_list = ['SMOTEENN', 'SMOTETomek']
# Balanced ensemble classifiers from imblearn.ensemble -- these resample
# internally, so no external sampler is needed when using them.
# NOTE(review): `base_estimator` was renamed to `estimator` in newer
# imbalanced-learn/scikit-learn releases -- confirm the installed version.
bbc = BalancedBaggingClassifier(n_estimators=10,
                                base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=42)
# brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
rusboost = RUSBoostClassifier(n_estimators=50, algorithm='SAMME.R',
                              random_state=42)
eec = EasyEnsembleClassifier(n_estimators=10, random_state=42)
# classifier objects and their display names (parallel lists)
be_clf_list = [bbc,rusboost,eec]
be_clf_name_list = ['Balanced_Bagging','RUSBoost','EasyEnsembleClassifier']
# five different classifiers to test out
# DTC1: entropy criterion, shallow tree (max_depth=4)
DTC1 = DecisionTreeClassifier(criterion='entropy',max_depth=4,random_state=42)
# the five classifiers, with display names in the parallel list below
clf_list1 = [DTC1, GaussianNB(),MLPClassifier(),SVC(),KNeighborsClassifier()]
clf_name_list1 = ['DTC1','NB_default','MLP_default','SVC_default','KNN_default']
# logistic regression classifiers: varying solver, penalty and C
# (C is the inverse regularization strength; lower C = stronger regularization)
lr_lbfgs = LogisticRegression(random_state=42)
lr_lbfgs_lowerC = LogisticRegression(C=0.1, random_state=42)
lr_newton = LogisticRegression(solver='newton-cg',random_state=42)
lr_lib = LogisticRegression(solver='liblinear',random_state=42)
lr_lib_lowerC = LogisticRegression(C=0.1,solver='liblinear',random_state=42)
lr_lib_l1 = LogisticRegression(solver='liblinear',penalty = 'l1',random_state=42)
lr_sag = LogisticRegression(solver='sag',random_state=42)
lr_saga = LogisticRegression(solver='saga',random_state=42)
# NOTE(review): lr_lbfgs_lowerC is defined above but not included in lr_list
lr_list = [lr_lbfgs,lr_newton,lr_sag,lr_saga, lr_lib, lr_lib_lowerC,lr_lib_l1,]
lr_name_list = ['lbfgs_l2','newton_l2','sag_l2','saga_l2','lib_l2','lib_lowerC','lib_l1',]
# bagging classifiers: default, more estimators, and two non-tree base learners
bag_list=[BaggingClassifier(random_state=42),
          BaggingClassifier(n_estimators=20,random_state=42),
          BaggingClassifier(base_estimator=SVC(),random_state=42),
          BaggingClassifier(base_estimator=LogisticRegression(),random_state=42)]
bag_name_list=['Bagging_default','Bagging_20','Bagging_SVC','Bagging_lr']
# boosting classifiers: AdaBoost with varied learning rate and base learner
boost_list = [AdaBoostClassifier(), AdaBoostClassifier(learning_rate=0.5),
              AdaBoostClassifier(base_estimator = LogisticRegression(), n_estimators=15,random_state=42),
              AdaBoostClassifier(base_estimator = LogisticRegression(), n_estimators=15,learning_rate=0.5,random_state=42)]
boost_name_list = ['Ada_default','Ada_dt_halflearning','Ada_lr_15','Ada_lr_15_halflearning']
# random forest classifiers: varying criterion, estimator count and max depth
rf_list = [RandomForestClassifier(random_state=42),
           RandomForestClassifier(criterion='entropy',random_state=42),
           RandomForestClassifier(n_estimators=50,random_state=42),
           RandomForestClassifier(criterion='entropy',n_estimators=50,random_state=42),
           RandomForestClassifier(max_depth=7,random_state=42),
           RandomForestClassifier(criterion='entropy',max_depth=7,random_state=42)]
rf_name_list = ['rf_default','rf_entropy_default','rf_50','rf_entropy_50','rf_max7','rf_entropy_max7']
# a hand-picked combination of the classifiers above, one representative per
# family, used for the sampler-comparison experiments further below
lr_lib_l1_lowerc = LogisticRegression(C=0.1,solver='liblinear',penalty = 'l1', random_state=42)
Ada_lr_15_halflearn = AdaBoostClassifier(base_estimator = LogisticRegression(), n_estimators=15,learning_rate=0.5,random_state=42)
rf_max7 = RandomForestClassifier(max_depth=7,random_state=42)
Bagging_20 = BaggingClassifier(n_estimators=20,random_state=42)
clf_list4 = [DTC1,SVC(),
             lr_lib_l1_lowerc,
             Bagging_20,
             Ada_lr_15_halflearn,
             rf_max7]
# display names, parallel to clf_list4
clf_name_list4 = ['DTC1','SVC_default','lr_lib_l1_lowerc','Bagging_20','Ada_lr_15_halflearn','rf_max7']
def fun_enc(df, df_name):
    """One-hot encode the categorical features of a dataset.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the target column 'IsBadBuy'.
    df_name : str
        Label used in the printed report.

    Returns
    -------
    X_enc_df : pd.DataFrame
        Encoded features: one dummy column per categorical level, numeric
        columns passed through unchanged.
    y_df : pd.Series
        The target column.
    X_col_list : list of str
        Original feature names, categorical first then numeric.
    enc_col_counts : list of int
        Number of encoded columns produced per original feature, in the
        same order as X_col_list.
    """
    # separate target variable and features
    y_df = df['IsBadBuy'].copy()
    X_df = df.drop('IsBadBuy', axis=1).copy()
    # separate numeric and categorical features
    X_num_df = X_df.select_dtypes(exclude=['object']).copy()
    X_cat_df = X_df.select_dtypes(include=['object']).copy()
    X_cat_list = X_cat_df.columns.tolist()
    X_col_list = X_cat_list + X_num_df.columns.tolist()
    enc_col_df_list = []  # encoded dataframes for use in pd.concat()
    enc_col_counts = []   # number of encoded columns per original feature
    # encode the categorical features
    for col in X_cat_list:
        dummy_df = pd.get_dummies(X_cat_df[col])  # one dummy column per level
        enc_col_counts.append(dummy_df.shape[1])
        enc_col_df_list.append(dummy_df)
    # pass the numeric features through unchanged
    for col in X_num_df:
        enc_col_counts.append(1)  # a numeric feature stays a single column
        # FIX: read from X_num_df (target already dropped) instead of the
        # shadowed `df` parameter; the values are identical, but this no
        # longer depends on the raw frame still holding the column.
        enc_col_df_list.append(X_num_df[col])
    X_enc_df = pd.concat(enc_col_df_list, axis=1)  # stitch everything together
    print(f'In {df_name}: the count of new columns created from each original column:\n', enc_col_counts)
    return X_enc_df, y_df, X_col_list, enc_col_counts
# one-hot encode each of the three datasets (original, imb1, imb2)
X_df, y_df, X_col_list, remove_col_counts = fun_enc(df, 'df')
X_df1, y_df1, X_col_list1, remove_col_counts1 = fun_enc(df_imb1, 'df_imb1')
X_df2, y_df2, X_col_list2, remove_col_counts2 = fun_enc(df_imb2, 'df_imb2')
def fun_split(df_name, X_df, y_df, train_pct, val_pct, y_levels=None):
    """Split features/target into train, validation and test sets and print
    the shapes and class distributions of each partition.

    train_pct of the rows go to the train set; the remainder is split again,
    with val_pct of it going to validation and the rest to test.

    Parameters
    ----------
    df_name : str
        Label used in the printed report.
    X_df, y_df : features and target.
    train_pct : float
        Fraction of rows for the train set.
    val_pct : float
        Fraction of the *remaining* rows for the validation set.
    y_levels : iterable, optional
        Target levels to report. Defaults to the levels observed in y_df
        ordered by frequency (FIX: previously this silently read the
        module-level y_levels global).

    Returns (X_train, y_train, X_val, y_val, X_test, y_test).
    """
    if y_levels is None:
        # derive the class levels from the target itself instead of relying
        # on a module-level global
        y_levels = y_df.value_counts().index
    # first split: train vs (validation + test)
    X_train, X_val_test, y_train, y_val_test = \
        train_test_split(X_df, y_df, train_size=train_pct, random_state=42)
    # second split: validation vs test
    X_val, X_test, y_val, y_test = \
        train_test_split(X_val_test, y_val_test, train_size=val_pct, random_state=42)
    # report the shapes of the three partitions
    # (FIX: dropped the pointless f-prefix on placeholder-free strings and
    # the stray trailing semicolon on the return)
    print(f'In {df_name}:')
    print('\n')
    print('Shape of X_train:', X_train.shape)
    print('Shape of y_train:', y_train.shape)
    print('\n')
    print('Shape of X_val:', X_val.shape)
    print('Shape of y_val:', y_val.shape)
    print('\n')
    print('Shape of X_test', X_test.shape)
    print('Shape of y_test', y_test.shape)
    print('\n')
    # class counts in the train, validation and test sets
    train_y_counts = y_train.value_counts()
    val_y_counts = y_val.value_counts()
    test_y_counts = y_test.value_counts()
    # print the class distribution (as percentages) for each partition
    print(f'In {df_name} train set:', '\n')
    for i in y_levels:
        print(f'{round(100*train_y_counts[i]/X_train.shape[0],2)} percent is {i}', '\n')
    print('\n')
    print(f'In {df_name} validation set:', '\n')
    for i in y_levels:
        print(f'{round(100*val_y_counts[i]/X_val.shape[0],2)} percent is {i}', '\n')
    print('\n')
    print(f'In {df_name} test set:', '\n')
    for i in y_levels:
        print(f'{round(100*test_y_counts[i]/X_test.shape[0],2)} percent is {i}', '\n')
    return X_train, y_train, X_val, y_val, X_test, y_test
print('\n')
# split the original df: 60% train, then 50/50 of the rest -> 20% val, 20% test
X_train, y_train, X_val, y_val, X_test, y_test = fun_split('df', X_df, y_df, 0.6, 0.5)
# split df_imb1 with the same proportions
X_train1, y_train1, X_val1, y_val1, X_test1, y_test1 = fun_split('df_imb1', X_df1, y_df1, 0.6, 0.5)
# split df_imb2 with the same proportions
X_train2, y_train2, X_val2, y_val2, X_test2, y_test2 = fun_split('df_imb2', X_df2, y_df2, 0.6, 0.5)
Don't resample the validation or test sets. Always evaluate on actual, non-resampled data.
# resample the train set from the original data (never resample the
# validation or test sets -- always evaluate on real data)
sm = SMOTE(random_state=42)
smk3 = SMOTE(k_neighbors=3, random_state=42)  # SMOTE with 3 nearest neighbors
X_train_sm, y_train_sm = sm.fit_resample(X_train.values, y_train)
print('Shapes of X_train_sm and y_train_sm:', X_train_sm.shape, y_train_sm.shape, '\n')
print('Target variable instance counts:\n', pd.Series(y_train_sm).value_counts())
# resample the train sets of the two more-imbalanced datasets
X_train1_sm, y_train1_sm = sm.fit_resample(X_train1.values, y_train1)
X_train2_sm, y_train2_sm = sm.fit_resample(X_train2.values, y_train2)
print('Shapes of X_train1_sm and y_train1_sm:', X_train1_sm.shape, y_train1_sm.shape, '\n')
print('Instance counts by values in target variable:\n', pd.Series(y_train1_sm).value_counts())
print('\n')
print('Shapes of X_train2_sm and y_train2_sm:', X_train2_sm.shape, y_train2_sm.shape, '\n')
print('Target variable instance counts:\n', pd.Series(y_train2_sm).value_counts())
# SMOTE with k_neighbors=3 applied to all three train sets
X_train_smk3, y_train_smk3 = smk3.fit_resample(X_train.values, y_train)
X_train1_smk3, y_train1_smk3 = smk3.fit_resample(X_train1.values, y_train1)
X_train2_smk3, y_train2_smk3 = smk3.fit_resample(X_train2.values, y_train2)
# BUG FIX: the next two prints previously reported X_train1_smk3 /
# y_train1_smk3 while being labelled as the original df's resampled sets
print('Shapes of X_train_smk3 and y_train_smk3:', X_train_smk3.shape, y_train_smk3.shape, '\n')
print('Target variable instance counts:\n', pd.Series(y_train_smk3).value_counts())
print('\n')
print('Shapes of X_train1_smk3 and y_train1_smk3:', X_train1_smk3.shape, y_train1_smk3.shape, '\n')
print('Target variable instance counts:\n', pd.Series(y_train1_smk3).value_counts())
print('\n')
print('Shapes of X_train2_smk3 and y_train2_smk3:', X_train2_smk3.shape, y_train2_smk3.shape, '\n')
print('Target variable instance counts:\n', pd.Series(y_train2_smk3).value_counts())
def fun_clf_split_val_test4(clf, clf_name, X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit clf on the train set and score it on the validation and test sets.

    Returns a one-row DataFrame with columns 'clf_name', 'test_results' and
    'validate_results'; the two result cells each hold a rounded
    classification-report DataFrame tagged with the classifier's name.
    NOTE: relies on the module-level y_levels for the label order.
    """
    model = clf.fit(X_train, y_train)

    def _report(y_true, y_pred):
        # build a rounded classification report and tag it with the clf name
        report = metrics.classification_report(y_true, y_pred,
                                               labels=y_levels, output_dict=True)
        report_df = round(pd.DataFrame.from_dict(report).reset_index(drop=False), 2)
        report_df['clf_name'] = clf_name
        return report_df

    # score on the validation set first, then on the test set
    val_report_df = _report(y_val, model.predict(X_val))
    test_report_df = _report(y_test, model.predict(X_test))
    # bundle validation and test results into a single row
    return pd.DataFrame({'clf_name': [clf_name],
                         'test_results': [test_report_df],
                         'validate_results': [val_report_df]})
# run the function on the original dataset that has been resampled
sm_val_test_results_df = fun_clf_split_val_test4(DTC1, 'DTC1', X_train_sm, y_train_sm, X_val, y_val, X_test, y_test)
smk3_val_test_results_df = fun_clf_split_val_test4(DTC1, 'DTC1', X_train_smk3, y_train_smk3, X_val, y_val, X_test, y_test)
print('Validation results using smote on df:\n', sm_val_test_results_df['validate_results'].iloc[0], '\n')
# BUG FIX: this line previously printed sm_val_test_results_df again instead
# of the k_neighbors=3 results
print('Validation results using smote with 3 neighbors on df:\n', smk3_val_test_results_df['validate_results'].iloc[0], '\n')
# BUG FIX: label typos corrected ('df::' and a stray 'df1' -- both rows below
# are computed on the original df)
print('Test results using smote on df:\n', sm_val_test_results_df['test_results'].iloc[0], '\n')
print('Test results using smote with 3 neighbors on df:\n', smk3_val_test_results_df['test_results'].iloc[0], '\n')
# run the function on the first "new" imbalanced dataset that has been resampled
sm_val_test_results_df1 = fun_clf_split_val_test4(DTC1, 'DTC1', X_train1_sm, y_train1_sm, X_val1, y_val1, X_test1, y_test1)
smk3_val_test_results_df1 = fun_clf_split_val_test4(DTC1, 'DTC1', X_train1_smk3, y_train1_smk3, X_val1, y_val1, X_test1, y_test1)
print('Validation results using smote on df1:\n', sm_val_test_results_df1['validate_results'].iloc[0], '\n')
print('Validation results using smote with 3 neighbors on df1:\n', smk3_val_test_results_df1['validate_results'].iloc[0], '\n')
print('Test results using smote on df1:\n', sm_val_test_results_df1['test_results'].iloc[0], '\n')
print('Test results using smote with 3 neighbors on df1:\n', smk3_val_test_results_df1['test_results'].iloc[0], '\n')
# run the function on the second "new" imbalanced dataset that has been resampled
sm_val_test_results_df2 = fun_clf_split_val_test4(DTC1, 'DTC1', X_train2_sm, y_train2_sm, X_val2, y_val2, X_test2, y_test2)
smk3_val_test_results_df2 = fun_clf_split_val_test4(DTC1, 'DTC1', X_train2_smk3, y_train2_smk3, X_val2, y_val2, X_test2, y_test2)
print('Validation results using smote on df2:\n', sm_val_test_results_df2['validate_results'].iloc[0], '\n')
print('Validation results using smote with 3 neighbors on df2:\n', smk3_val_test_results_df2['validate_results'].iloc[0], '\n')
print('Test results using smote on df2:\n', sm_val_test_results_df2['test_results'].iloc[0], '\n')
print('Test results using smote with 3 neighbors on df2:\n', smk3_val_test_results_df2['test_results'].iloc[0], '\n')
def fun_split_val_multi_clf(clf_list, clf_name_list, X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit every classifier in clf_list on the train set, score each on the
    validation and test sets, and return the stacked one-row results."""
    per_clf_rows = []
    for idx, clf in enumerate(clf_list):
        row = fun_clf_split_val_test4(clf, clf_name_list[idx],
                                      X_train, y_train, X_val, y_val, X_test, y_test)
        per_clf_rows.append(row)
    # reset the index so pd.concat() yields a clean 0..n-1 index
    return pd.concat(per_clf_rows).reset_index(drop=True)
# Run fun_split_val_multi_clf() with the resampled training data and the five
# baseline classifiers (clf_list1) on each of the three datasets
sm_multi_clf_results_df = fun_split_val_multi_clf(clf_list1, clf_name_list1,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test)
sm_multi_clf_results_df1 = fun_split_val_multi_clf(clf_list1, clf_name_list1,X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1)
smk3_multi_clf_results_df2 = fun_split_val_multi_clf(clf_list1, clf_name_list1,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2)
def fun_multi_clf_reports(split_val_multi_clf_results_df):
    """Stack the per-classifier validation reports and test reports.

    Returns (val_report_df, test_report_df): one combined frame per split,
    with the per-metric row labels exposed in a 'scorer' column.
    """
    def _stack(report_series):
        # concatenate the per-classifier report frames in row order, then
        # surface the metric names (stored in the 'index' column) as 'scorer'
        stacked = pd.concat(list(report_series))
        stacked = stacked.reset_index(drop=False)
        return stacked.rename(columns={"index": "scorer"})

    val_report_df = _stack(split_val_multi_clf_results_df['validate_results'])
    test_report_df = _stack(split_val_multi_clf_results_df['test_results'])
    return val_report_df, test_report_df
# run the "combine reports" function for each of the three variations of data
sm_multi_clf_val_report_df, sm_multi_clf_test_report_df = fun_multi_clf_reports(sm_multi_clf_results_df)
sm_multi_clf_val_report_df1, sm_multi_clf_test_report_df1 = fun_multi_clf_reports(sm_multi_clf_results_df1)
smk3_multi_clf_val_report_df2, smk3_multi_clf_test_report_df2 = fun_multi_clf_reports(smk3_multi_clf_results_df2)
def fun_split_val_test_comparison4(sam_name, clf_name_df, multi_clf_val_report_df, multi_clf_test_report_df, comp_name):
    """Print tables and draw seaborn bar charts comparing validation-set and
    test-set performance for every metric in the module-level metric_map.

    Parameters
    ----------
    sam_name : str
        Sampler name, used in the chart title.
    clf_name_df : pd.Series
        Classifier names, one per classifier in the reports.
    multi_clf_val_report_df, multi_clf_test_report_df : pd.DataFrame
        Combined report frames produced by fun_multi_clf_reports().
    comp_name : str
        Column name used to tag each long-format row as 'test_set'/'val_set'.

    NOTE: depends on the module-level metric_map and on seaborn (sns).
    """
    for i in range(0, len(metric_map.x_label)):
        scorer = metric_map.scorer.iloc[i]
        col = metric_map.col.iloc[i]
        x_label = metric_map.x_label.iloc[i]
        # pull this metric's score column out of each report
        val_metric_df = multi_clf_val_report_df[
            multi_clf_val_report_df['scorer'].str.match(scorer)
        ].reset_index(drop=True)[col]
        test_metric_df = multi_clf_test_report_df[
            multi_clf_test_report_df['scorer'].str.match(scorer)
        ].reset_index(drop=True)[col]
        # side-by-side (wide) table, sorted by the test-set result
        wide_df = pd.concat([clf_name_df, test_metric_df, val_metric_df], axis=1)
        wide_df.columns = ['clf_name', 'test_set_result', 'val_set_result']
        wide_df = wide_df.sort_values(by=['test_set_result'], ascending=False)
        print(x_label, 'from test (left) and from validation (right)\n')
        print(wide_df, '\n')
        # long-format frame for seaborn: one row per (classifier, data source).
        # FIX: DataFrame.append() was removed in pandas 2.0 -- build the rows
        # as a list of records instead.
        records = []
        for r in range(0, len(clf_name_df)):
            records.append({'clf_name': clf_name_df.iloc[r],
                            x_label: test_metric_df.iloc[r],
                            comp_name: 'test_set'})
            records.append({'clf_name': clf_name_df.iloc[r],
                            x_label: val_metric_df.iloc[r],
                            comp_name: 'val_set'})
        long_df = pd.DataFrame(records, columns=['clf_name', x_label, comp_name])
        # grouped horizontal bars: test vs validation per classifier,
        # ordered by the validation-set score
        g = sns.catplot(
            data=long_df, kind="bar",
            y="clf_name", x=x_label, hue=comp_name, orient='h',
            # BUG FIX: the filter column was hard-coded as 'data_source'
            # instead of using the comp_name parameter
            order=long_df[long_df[comp_name].str.match('val_set')]
                .sort_values(by=x_label, ascending=False)['clf_name']
        )
        g.set(xlim=(0.0, 1.0))
        g.fig.set_figwidth(10)
        g.fig.suptitle(f'sampler {sam_name}')
# dataframe mapping each reported metric to the classification-report row
# ('scorer'), the report column to read ('col') and its display label
# ('x_label'); used to pull scores out of the validation and test reports.
# FIX: DataFrame.append() was removed in pandas 2.0, so the table is built
# from a list of records in one step (same rows, same order).
metric_map = pd.DataFrame([
    {'scorer': 'precision', 'col': 'accuracy', 'x_label': 'overall_accuracy'},
    {'scorer': 'recall', 'col': 'Yes', 'x_label': 'recall_yes'},
    {'scorer': 'recall', 'col': 'No', 'x_label': 'recall_no'},
    {'scorer': 'precision', 'col': 'Yes', 'x_label': 'precision_yes'},
    {'scorer': 'precision', 'col': 'No', 'x_label': 'precision_no'},
    {'scorer': 'f1-score', 'col': 'Yes', 'x_label': 'f1_yes'},
    {'scorer': 'f1-score', 'col': 'No', 'x_label': 'f1_no'},
], columns=['scorer', 'col', 'x_label'])
metric_map
# series of classifier names: one 'precision' row per classifier in the
# combined test report
clf_name_df = sm_multi_clf_test_report_df1.clf_name[sm_multi_clf_test_report_df1['scorer'].str.match('precision')].reset_index(drop=True)
clf_name_df
# NOTE(review): comp_name ('data-source', with a hyphen) is never used -- the
# calls below all pass the literal 'data_source' instead; confirm intended
comp_name = 'data-source'
print('Comparisons for df and sm:\n')
fun_split_val_test_comparison4('SMOTE',clf_name_df,sm_multi_clf_val_report_df,sm_multi_clf_test_report_df,'data_source')
print('Comparisons for df1 and sm:\n')
fun_split_val_test_comparison4('SMOTE',clf_name_df,sm_multi_clf_val_report_df1,sm_multi_clf_test_report_df1,'data_source')
print('Comparisons for df2 and smk3:\n')
fun_split_val_test_comparison4('SMOTE_K3',clf_name_df,smk3_multi_clf_val_report_df2,smk3_multi_clf_test_report_df2,'data_source')
def fun_split_val_multi_clf_all_tasks(sam_name, clf_list, clf_name_list, X_train, y_train, X_val, y_val, X_test, y_test, comp_name):
    """End-to-end evaluation: fit every classifier, build the combined
    validation/test reports, and print/plot the comparisons."""
    # fit each classifier and collect its per-classifier result row
    results_df = fun_split_val_multi_clf(clf_list, clf_name_list,
                                         X_train, y_train, X_val, y_val, X_test, y_test)
    # merge the per-classifier reports into one validation and one test frame
    val_report_df, test_report_df = fun_multi_clf_reports(results_df)
    # one classifier name per 'precision' row of the combined test report
    names = test_report_df.clf_name[test_report_df['scorer'].str.match('precision')].reset_index(drop=True)
    # tabular and graphical comparison of validation vs test performance
    fun_split_val_test_comparison4(sam_name, names, val_report_df, test_report_df, comp_name)
Now that we have all functions defined, we can see how different classifiers perform. We will test the classifiers grouped in lists earlier in the notebook.
Logistic regression list
# logistic regression list on original df (train set resampled with SMOTE)
fun_split_val_multi_clf_all_tasks('SMOTE',lr_list,lr_name_list,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test,'data_source')
# repeat for df_imb1
fun_split_val_multi_clf_all_tasks('SMOTE',lr_list,lr_name_list,X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1,'data_source')
# repeat for df_imb2 (train set resampled with SMOTE k_neighbors=3)
fun_split_val_multi_clf_all_tasks('SMOTE_k3',lr_list,lr_name_list,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2,'data_source')
Bagging list
# bagging list on original df (SMOTE-resampled train set)
fun_split_val_multi_clf_all_tasks('SMOTE',bag_list,bag_name_list,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test,'data_source')
# bagging list on df1
fun_split_val_multi_clf_all_tasks('SMOTE',bag_list,bag_name_list,X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1,'data_source')
# bagging list on df2 (SMOTE k_neighbors=3)
fun_split_val_multi_clf_all_tasks('SMOTE_k3',bag_list,bag_name_list,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2,'data_source')
Boosting list
# boosting list on original df (SMOTE-resampled train set)
fun_split_val_multi_clf_all_tasks('SMOTE',boost_list,boost_name_list,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test,'data_source')
# boosting list on df1
# BUG FIX: this call previously passed the df2 (SMOTE_k3) data while labelled df1
fun_split_val_multi_clf_all_tasks('SMOTE',boost_list,boost_name_list,X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1,'data_source')
# boosting list on df2 (SMOTE k_neighbors=3)
# BUG FIX: this call previously ran rf_list on the original df (a copy-paste
# duplicate of the random-forest cell below) instead of boosting on df2
fun_split_val_multi_clf_all_tasks('SMOTE_k3',boost_list,boost_name_list,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2,'data_source')
Random forest list
# random forest list on original df (the previous comments incorrectly said
# "logistic regression"; the calls use rf_list)
fun_split_val_multi_clf_all_tasks('SMOTE',rf_list,rf_name_list,X_train_sm,y_train_sm,X_val,y_val,X_test,y_test,'data_source')
# random forest list on df1
fun_split_val_multi_clf_all_tasks('SMOTE',rf_list,rf_name_list,X_train1_sm,y_train1_sm,X_val1,y_val1,X_test1,y_test1,'data_source')
# random forest list on df2 (SMOTE k_neighbors=3)
fun_split_val_multi_clf_all_tasks('SMOTE_k3',rf_list,rf_name_list,X_train2_smk3,y_train2_smk3,X_val2,y_val2,X_test2,y_test2,'data_source')
We can now run the "all tasks" function for multiple oversampling objects, undersampling objects, and objects that combine the two sampling strategies.
def fun_multi_clf_sampler_all_tasks(sampler_list, sampler_name_list, clf_list, clf_name_list, X_train, y_train, X_val, y_val, X_test, y_test, comp_name):
    """For each sampler, resample the train set, evaluate every classifier on
    the (untouched) validation and test sets, and report the comparisons.

    Only the train set is resampled; validation and test data stay as-is.
    """
    for idx, sampler in enumerate(sampler_list):
        sam_name = sampler_name_list[idx]
        # resample the training data with this sampler
        X_train_sam, y_train_sam = sampler.fit_resample(X_train, y_train)
        # show the class balance achieved by the resampling
        print('Sampler: ', sampler, '\n')
        print('y value counts of resampled train set\n', pd.Series(y_train_sam).value_counts(), '\n')
        # fit every classifier on the resampled train set and collect reports
        results_df = fun_split_val_multi_clf(clf_list, clf_name_list,
                                             X_train_sam, y_train_sam,
                                             X_val, y_val, X_test, y_test)
        # merge the per-classifier reports into validation and test frames
        val_report_df, test_report_df = fun_multi_clf_reports(results_df)
        # one classifier name per 'precision' row of the combined test report
        names = test_report_df.clf_name[test_report_df['scorer'].str.match('precision')].reset_index(drop=True)
        # tabular and graphical comparison of validation vs test performance
        fun_split_val_test_comparison4(sam_name, names, val_report_df, test_report_df, comp_name)
Evaluating oversampling techniques on the three datasets
# Evaluate oversampling techniques on original df
# (.values is passed so the samplers receive plain numpy arrays)
print('For df:\n')
fun_multi_clf_sampler_all_tasks(osampler_list,osampler_name_list,clf_list4,clf_name_list4,X_train.values,y_train,X_val,y_val,X_test,y_test,'data_source')
# Evaluate oversampling techniques on df1
print('For df1:\n')
fun_multi_clf_sampler_all_tasks(osampler_list,osampler_name_list,clf_list4,clf_name_list4,X_train1.values,y_train1,X_val1,y_val1,X_test1,y_test1,'data_source')
# Evaluate oversampling techniques on df2
print('For df2:\n')
fun_multi_clf_sampler_all_tasks(osampler_list,osampler_name_list,clf_list4,clf_name_list4,X_train2.values,y_train2,X_val2,y_val2,X_test2,y_test2,'data_source')
Evaluating undersampling techniques on the three datasets
# Evaluate undersampling techniques on original df
print('For df:\n')
fun_multi_clf_sampler_all_tasks(usampler_list,usampler_name_list,clf_list4,clf_name_list4,X_train.values,y_train,X_val,y_val,X_test,y_test,'data_source')
# Evaluate undersampling techniques on df1
print('For df1:\n')
fun_multi_clf_sampler_all_tasks(usampler_list,usampler_name_list,clf_list4,clf_name_list4,X_train1.values,y_train1,X_val1,y_val1,X_test1,y_test1,'data_source')
# Evaluate undersampling techniques on df2
print('For df2:\n')
fun_multi_clf_sampler_all_tasks(usampler_list,usampler_name_list,clf_list4,clf_name_list4,X_train2.values,y_train2,X_val2,y_val2,X_test2,y_test2,'data_source')
Evaluating techniques that combine oversampling and undersampling
# Evaluate combined over+under resampling techniques on original df
print('For df:\n')
fun_multi_clf_sampler_all_tasks(csampler_list,csampler_name_list,clf_list4,clf_name_list4,X_train.values,y_train,X_val,y_val,X_test,y_test,'data_source')
# Evaluate combined resampling techniques on df1
print('For df1:\n')
fun_multi_clf_sampler_all_tasks(csampler_list,csampler_name_list,clf_list4,clf_name_list4,X_train1.values,y_train1,X_val1,y_val1,X_test1,y_test1,'data_source')
# Evaluate combined resampling techniques on df2
print('For df2:\n')
fun_multi_clf_sampler_all_tasks(csampler_list,csampler_name_list,clf_list4,clf_name_list4,X_train2.values,y_train2,X_val2,y_val2,X_test2,y_test2,'data_source')
Balanced ensemble classifiers
Note the data is automatically balanced with these classifiers, so the function will have "None" for the resampler parameter.
# Evaluate balanced ensemble classifiers on original df; these classifiers
# balance the data internally, hence sampler name 'None'
print('For df:\n')
fun_split_val_multi_clf_all_tasks('None',be_clf_list, be_clf_name_list,X_train,y_train,X_val,y_val,X_test,y_test,'data_source')
# Evaluate balanced ensemble classifiers on df1
print('For df1:\n')
fun_split_val_multi_clf_all_tasks('None',be_clf_list, be_clf_name_list,X_train1,y_train1,X_val1,y_val1,X_test1,y_test1,'data_source')
# Evaluate balanced ensemble classifiers on df2
print('For df2:\n')
fun_split_val_multi_clf_all_tasks('None',be_clf_list, be_clf_name_list,X_train2,y_train2,X_val2,y_val2,X_test2,y_test2,'data_source')
When selecting the best combination of sampling method and classifier from above, the key thing to remember is that we care most about minimizing type II error for the "Yes" class. In other words, we want to minimize the number of actual lemons that the model incorrectly classifies as non-lemons. Granted, it is important to be mindful of the tradeoffs. The random forest (rf_max7) classifier combined with the "NearMiss2" undersampling technique provides reasonable results for recall on the "Yes" class, but the tradeoff for precision should be noted.
!cp "/content/drive/MyDrive/Colab Notebooks/car-auction-resampling.ipynb" ./
!jupyter nbconvert --to html "car-auction-resampling.ipynb"